Goal: Compare super-claims 1, 3, and 5 — 1: Not Happening; 3: Climate Impacts Not Bad; 5: Science/Scientists Not Reliable.
knitr::opts_chunk$set(echo = TRUE)
library(jsonlite) # allows us to read in json files
library(tidyverse) # allows us to do lots of data manipulation and basic data science
library(here) # allows us to cut out long file paths (ex. "users/connor/dowloads/etc")
library(forcats) #
library(tidytext) # allows us to tokenize data
library(dplyr) # allows us to manipulate dataframes
library(stringr) # allows us to count the number of words in a cell
library(quanteda) # allows us to tokenize data
library(quanteda.textplots) # allows us to make network plots
library(gridExtra) # allows us to combine multiple plots into 1
library(wordcloud) # allows us to generate word clouds
library(fmsb)
library(plotly) #interactive ggplot graphs
library(ggthemes) # more themes for ggplot
library(tm) #for textmining corpus(), removePunctuation()
library(syuzhet) # for sentiment analysis, getNrc()
library(wordcloud2) # for comparison clouds
library(plotrix) # for pyramid plots
library(RColorBrewer) # for more color palettes
# Load the labeled training set: one row per document with a `text`
# column and a `claim` code (e.g. "1_1", "5_2").
nature_analysis <- read_csv(here("data/training.csv"))
## Rows: 23436 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text, claim
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Super Claim 1 Not Happening Filter() to select super claim 1
# Filter to super-claim 1 ("Not Happening").
# Anchor the pattern: an unanchored "1_" also matches deeper codes
# such as "5_1_2" (which contains the substring "1_").
na_1 <- nature_analysis %>%
  filter(str_detect(claim, "^1_"))
Add word_count column using mutate()
# Add a per-document word count (runs of non-whitespace).
# Reference the column directly instead of `na_1$text`: the `$` form
# bypasses the piped data and silently breaks under grouping/filtering.
na_1 <- na_1 %>%
  mutate(word_count = str_count(text, "\\S+"))
Distribution visual, geom_histogram
# Histogram of document word counts for super-claim 1, colored by
# sub-claim code.
ggplot(na_1, aes(x = word_count, fill = claim)) +
geom_histogram(bins = 67, color = "black") +
theme(text = element_text(family = "Menlo-Bold", size = 12),
legend.title = element_text(family = "Menlo-Bold", size = 12)) +
labs(title = "Distribution of Claims",
subtitle = "Claim 1")
Tokenize using unnest_tokens() to separate text into words
# Tokenize claim-1 documents into single words and tally each word,
# most frequent first (count(sort = TRUE) == count + arrange(desc(n))).
na_1_tokenized <- na_1 %>%
  unnest_tokens(words, text) %>%
  count(words, sort = TRUE)
Filter() out stopwords()
# Drop English stopwords (tm/quanteda stopword list) from the counts.
na_1_tokenized <- na_1_tokenized %>%
filter(!words %in% stopwords("english"))
Word Cloud 1
# Word cloud of the 200 most frequent claim-1 words (min frequency 5).
wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words = 200, min.freq = 5, random.order = FALSE, colors = c("royalblue1","seagreen2", "orangered"), family = "Avenir")
# Build a quanteda corpus for claim 1: lowercase tokens with
# punctuation and stopwords removed.
na_1_corpus <- corpus(na_1$text)
toks <- na_1_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
# Windowed feature co-occurrence matrix; plot a network of the 30
# most frequent features.
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
textplot_network(min_freq = 0.5)
# Bigram frequencies for claim 1: split each text into word pairs,
# drop any pair containing a stopword, then count pairs.
na_1_claims <- na_1 %>%
  select(text)
ngrams <- na_1_claims %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ")
ngrams_1 <- ngrams %>%
  count(bigram, sort = TRUE)
# NRC emotion/sentiment scoring (syuzhet) over raw claim-1 texts:
# one row per document, one column per emotion/polarity.
na_1_claims_vector <- as.character(na_1_claims$text)
na_1_sentiment <- get_nrc_sentiment(na_1_claims_vector)
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `spread()` instead.
## ℹ The deprecated feature was likely used in the syuzhet package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Total each NRC sentiment across all claim-1 documents and plot.
# colSums() already consumes the whole frame, so the redundant `[,]`
# subset is dropped; geom_col() is the idiomatic spelling of
# geom_bar(stat = "identity").
na_1_sentiment_score <- data.frame(colSums(na_1_sentiment))
names(na_1_sentiment_score) <- 'score'
na_1_sentiment_score <- cbind("sentiment" = rownames(na_1_sentiment_score), na_1_sentiment_score)
ggplot(na_1_sentiment_score, aes(x = sentiment, y = score)) +
  geom_col(aes(fill = sentiment)) +
  labs(x = "Sentiments", y = "Scores", title = "Sentiment for Super Claim 1")
#Super Claim 3 Climate Impacts Not Bad Filter() for super claim 3
# Filter to super-claim 3 ("Climate Impacts Not Bad"); anchored so
# only codes that *start* with "3_" match.
na_3 <- nature_analysis %>%
  filter(str_detect(claim, "^3_"))
Add word_count column using mutate()
# Add per-document word count; use the bare column rather than
# `na_3$text` so mutate() operates on the piped data.
na_3 <- na_3 %>%
  mutate(word_count = str_count(text, "\\S+"))
Distribution visual, geom_histogram
# Histogram of document word counts for super-claim 3, colored by
# sub-claim code.
ggplot(na_3, aes(x = word_count, fill = claim)) +
geom_histogram(bins = 67, color = "black") +
theme(text = element_text(family = "Menlo-Bold", size = 12),
legend.title = element_text(family = "Menlo-Bold", size = 12)) +
labs(title = "Distribution of Claims",
subtitle = "Claim 3")
Tokenize using unnest_tokens()
# Tokenize claim-3 documents into words and count frequencies,
# most common first.
na_3_tokenized <- na_3 %>%
unnest_tokens(words, text)
na_3_tokenized <- na_3_tokenized %>%
count(words) %>%
arrange(desc(n))
Filter() out stopwords()
# Remove tidytext stop words plus citation/noise tokens ("et", "al", "2").
na_3_tokenized <- na_3_tokenized %>%
anti_join(stop_words, by = c("words" = "word")) %>%
filter(!words %in% c("et", "al", "2"))
Word Cloud 3
# Word cloud for claim 3 using the 12-color "Paired" brewer palette.
wordcloud(na_3_tokenized$words, freq = na_3_tokenized$n, max.words = 200, min.freq = 5, random.order = FALSE, random.color = FALSE, colors = brewer.pal(12, "Paired"))
# Co-occurrence network for claim 3 (same recipe as claim 1).
na_3_corpus <- corpus(na_3$text)
toks <- na_3_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
textplot_network(min_freq = 0.5)
# Bigram frequencies for claim 3.
# NOTE(review): the result column here is named "bigrams" (plural),
# unlike the claim-1/claim-5 pipelines which use "bigram" — downstream
# code (word clouds, corpus) depends on this spelling.
na_3_claims <- na_3 %>%
select(text)
ngrams_3 <- na_3_claims %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngrams_3 <- ngrams_3 %>%
separate(bigram, c("word1", "word2"), sep = " ")
ngrams_3 <- ngrams_3 %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
ngrams_3 <- ngrams_3 %>%
unite(bigrams, word1, word2, sep = " ")
ngrams_3 <- ngrams_3 %>%
count(bigrams, sort = TRUE)
# NRC sentiment totals for claim 3, plotted per sentiment.
# Redundant `[,]` dropped; geom_col() replaces geom_bar(stat="identity").
na_3_claims_vector <- as.character(na_3_claims$text)
na_3_sentiment <- get_nrc_sentiment(na_3_claims_vector)
na_3_sentiment_score <- data.frame(colSums(na_3_sentiment))
names(na_3_sentiment_score) <- 'score'
na_3_sentiment_score <- cbind("sentiment" = rownames(na_3_sentiment_score), na_3_sentiment_score)
ggplot(na_3_sentiment_score, aes(x = sentiment, y = score)) +
  geom_col(aes(fill = sentiment)) +
  labs(x = "Sentiments", y = "Scores", title = "Sentiment for Super Claim 3")
#Super Claim 5 Science/Scientist Not Reliable Filter() for super claim 5
# Filter to super-claim 5 ("Science/Scientists Not Reliable");
# anchored so only codes starting with "5_" match.
na_5 <- nature_analysis %>%
  filter(str_detect(claim, "^5_"))
Add word_count column using mutate()
# Add per-document word count; bare column reference instead of
# `na_5$text` so mutate() stays tied to the piped data.
na_5 <- na_5 %>%
  mutate(word_count = str_count(text, "\\S+"))
Distribution visual, geom_histogram
# Histogram of document word counts for super-claim 5, colored by
# sub-claim code.
ggplot(na_5, aes(x = word_count, fill = claim)) +
geom_histogram(bins = 67, color = "black") +
theme(text = element_text(family = "Menlo-Bold", size = 12),
legend.title = element_text(family = "Menlo-Bold", size = 12)) +
labs(title = "Distribution of Claims",
subtitle = "Claim 5")
Tokenize using unnest_tokens()
# BUG FIX: this previously tokenized the full `nature_analysis` data
# set, so the "claim 5" word counts and word cloud actually described
# every claim. Tokenize the filtered na_5 subset instead.
na_5_tokenized <- na_5 %>%
  unnest_tokens(words, text)
na_5_tokenized <- na_5_tokenized %>%
  count(words) %>%
  arrange(desc(n))
Filter() out stopwords()
# Drop English stopwords from the claim-5 word counts.
na_5_tokenized <- na_5_tokenized %>%
filter(!words %in% stopwords("english"))
Word Cloud 5
# Word cloud for claim 5. The argument is spelled out as `colors =`;
# the original `color =` only worked via R's partial argument matching.
wordcloud(na_5_tokenized$words, freq = na_5_tokenized$n, max.words = 200, min.freq = 5, random.order = FALSE, random.color = FALSE, colors = brewer.pal(12, "Paired"))
# Co-occurrence network for claim 5 (same recipe as claims 1 and 3).
na_5_corpus <- corpus(na_5$text)
toks <- na_5_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)
# Bigram frequencies for claim 5: split into word pairs, drop pairs
# containing a stopword, rejoin, and count.
na_5_claims <- na_5 %>%
select(text)
ngrams_5 <- na_5_claims %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngrams_5 <- ngrams_5 %>%
separate(bigram, c("word1", "word2"), sep = " ")
ngrams_5 <- ngrams_5 %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
ngrams_5 <- ngrams_5 %>%
unite(bigram, word1, word2, sep = " ")
ngrams_5 <- ngrams_5 %>%
count(bigram, sort = TRUE)
# NRC sentiment totals for claim 5, plotted per sentiment.
# Redundant `[,]` dropped; geom_col() replaces geom_bar(stat="identity").
na_5_claims_vector <- as.character(na_5_claims$text)
na_5_claims_sentiment <- get_nrc_sentiment(na_5_claims_vector)
na_5_claims_sentiment_score <- data.frame(colSums(na_5_claims_sentiment))
names(na_5_claims_sentiment_score) <- 'score'
na_5_claims_sentiment_score <- cbind("sentiment" = rownames(na_5_claims_sentiment_score), na_5_claims_sentiment_score)
ggplot(na_5_claims_sentiment_score, aes(sentiment, score)) +
  geom_col(aes(fill = sentiment)) +
  labs(x = "Sentiment", y = "Score", title = "Sentiment for Super Claim 5")
#Super Claim 5_1 Filter() to select sub-claim 5_1
# Filter to sub-claim 5_1; anchored so only codes starting with
# "5_1" match.
na_5_1 <- nature_analysis %>%
  filter(str_detect(claim, "^5_1"))
Add word_count column using mutate()
# Add per-document word count; bare column instead of `na_5_1$text`.
na_5_1 <- na_5_1 %>%
  mutate(word_count = str_count(text, "\\S+"))
Tokenize using unnest_tokens()
# BUG FIX: previously tokenized the whole `nature_analysis` data set,
# so the 5_1 word counts described every claim. Use the filtered
# na_5_1 subset.
na_5_1_tokenized <- na_5_1 %>%
  unnest_tokens(words, text)
na_5_1_tokenized <- na_5_1_tokenized %>%
  count(words) %>%
  arrange(desc(n))
Filter() out stopwords()
# Drop English stopwords, then draw the 5_1 word cloud.
# `colors =` spelled out (original relied on partial matching).
na_5_1_tokenized <- na_5_1_tokenized %>%
  filter(!words %in% stopwords("english"))
wordcloud(na_5_1_tokenized$words, freq = na_5_1_tokenized$n, max.words = 200, min.freq = 5, random.order = FALSE, random.color = FALSE, colors = brewer.pal(12, "Paired"))
# Co-occurrence network for sub-claim 5_1 (same recipe as above).
na_5_1_corpus <- corpus(na_5_1$text)
toks <- na_5_1_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
textplot_network(min_freq = 0.5)
# Bigram frequencies for sub-claim 5_1.
na_5_1_claims <- na_5_1 %>%
select(text)
ngrams_5_1 <- na_5_1_claims %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngrams_5_1 <- ngrams_5_1 %>%
separate(bigram, c("word1", "word2"), sep = " ")
ngrams_5_1 <- ngrams_5_1 %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
ngrams_5_1 <- ngrams_5_1 %>%
unite(bigram, word1, word2, sep = " ")
ngrams_5_1 <- ngrams_5_1 %>%
count(bigram, sort = TRUE)
# NRC sentiment totals for sub-claim 5_1, plotted per sentiment.
# Redundant `[,]` dropped; geom_col() replaces geom_bar(stat="identity").
na_5_1_claims_vector <- as.character(na_5_1_claims$text)
na_5_1_claims_sentiment <- get_nrc_sentiment(na_5_1_claims_vector)
na_5_1_claims_sentiment_score <- data.frame(colSums(na_5_1_claims_sentiment))
names(na_5_1_claims_sentiment_score) <- 'score'
na_5_1_claims_sentiment_score <- cbind("sentiment" = rownames(na_5_1_claims_sentiment_score), na_5_1_claims_sentiment_score)
ggplot(na_5_1_claims_sentiment_score, aes(sentiment, score)) +
  geom_col(aes(fill = sentiment)) +
  labs(x = "Sentiment", y = "Score", title = "Sentiment for Super Claim 5_1")
#Super Claim 5_2 Filter() to select sub-claim 5_2
# Filter to sub-claim 5_2; anchored so only codes starting with
# "5_2" match.
na_5_2 <- nature_analysis %>%
  filter(str_detect(claim, "^5_2"))
Add word_count column using mutate()
# Add per-document word count; bare column instead of `na_5_2$text`.
na_5_2 <- na_5_2 %>%
  mutate(word_count = str_count(text, "\\S+"))
Tokenize using unnest_tokens()
# BUG FIX: previously tokenized the whole `nature_analysis` data set,
# so the 5_2 word counts described every claim. Use the filtered
# na_5_2 subset.
na_5_2_tokenized <- na_5_2 %>%
  unnest_tokens(words, text)
na_5_2_tokenized <- na_5_2_tokenized %>%
  count(words) %>%
  arrange(desc(n))
Filter() out stopwords()
# Drop English stopwords, then draw the 5_2 word cloud.
# `colors =` spelled out (original relied on partial matching).
na_5_2_tokenized <- na_5_2_tokenized %>%
  filter(!words %in% stopwords("english"))
wordcloud(na_5_2_tokenized$words, freq = na_5_2_tokenized$n, max.words = 200, min.freq = 5, random.order = FALSE, random.color = FALSE, colors = brewer.pal(12, "Paired"))
# Co-occurrence network for sub-claim 5_2 (same recipe as above).
na_5_2_corpus <- corpus(na_5_2$text)
toks <- na_5_2_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
textplot_network(min_freq = 0.5)
# Bigram frequencies for sub-claim 5_2.
na_5_2_claims <- na_5_2 %>%
select(text)
ngrams_5_2 <- na_5_2_claims %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngrams_5_2 <- ngrams_5_2 %>%
separate(bigram, c("word1", "word2"), sep = " ")
ngrams_5_2 <- ngrams_5_2 %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
ngrams_5_2 <- ngrams_5_2 %>%
unite(bigram, word1, word2, sep = " ")
ngrams_5_2 <- ngrams_5_2 %>%
count(bigram, sort = TRUE)
# NRC sentiment totals for sub-claim 5_2, plotted per sentiment.
# Redundant `[,]` dropped; geom_col() replaces geom_bar(stat="identity").
na_5_2_claims_vector <- as.character(na_5_2_claims$text)
na_5_2_claims_sentiment <- get_nrc_sentiment(na_5_2_claims_vector)
na_5_2_claims_sentiment_score <- data.frame(colSums(na_5_2_claims_sentiment))
names(na_5_2_claims_sentiment_score) <- 'score'
na_5_2_claims_sentiment_score <- cbind("sentiment" = rownames(na_5_2_claims_sentiment_score), na_5_2_claims_sentiment_score)
ggplot(na_5_2_claims_sentiment_score, aes(sentiment, score)) +
  geom_col(aes(fill = sentiment)) +
  labs(x = "Sentiment", y = "Score", title = "Sentiment for Super Claim 5_2")
#GGPPLOT comparisons
# Join the per-sentiment totals for claims 1, 3, and 5 into one wide frame.
united_sent_score <- full_join(na_1_sentiment_score, na_3_sentiment_score, by = "sentiment") %>%
full_join(na_5_claims_sentiment_score, by = "sentiment") %>% rename(claim_1 = score.x, claim_3 = score.y, claim_5 = score) #%>%
#transmute(sentiment, score = score.x + score.y + score)
# NOTE(review): the three geom_col layers below are drawn on top of
# one another, not stacked — position = "stack" only stacks within a
# single layer, so shorter bars are hidden behind taller ones.
# Consider pivot_longer() + a single geom_col() for a true
# stacked/grouped comparison — confirm intended chart before changing.
ggplot(united_sent_score, aes(x = sentiment)) +
geom_col(aes(y = claim_5, fill = "claim_5"), position = "stack") +
geom_col(aes(y = claim_1, fill = "claim_1"), position = "stack") +
geom_col(aes(y = claim_3, fill = "claim_3"), position = "stack") +
#coord_flip() +
#theme_wsj() +
theme_minimal()+
scale_fill_manual(values = c("claim_5" = "#08589E", "claim_1" = "#4EB3D3", "claim_3" = "#A8DDB5")) +
#scale_fill_manual(values = c("claim_5" = "#FF7F0E", "claim_1" = "#2CA02C", "claim_3" = "#1F77B4")) +
theme(text = element_text(family = "Arial", size = 21), axis.text.y = element_text(family = "Arial", size = 21), axis.title = element_text(family = "Arial", size = 30), plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))+
labs(x = "Sentiment", y = "Score", title = "Sentiment Scores", subtitle = "Super Claims 1, 3, & 5", fill = "Claim")
#Word Cloud
# Bigram word clouds per claim.
# BUG FIX: the 5_1 and 5_2 clouds previously paired bigram labels with
# the *unigram* frequency tables (na_5_1_tokenized$n / na_5_2_tokenized$n)
# — mismatched, differently-ordered vectors — they now use their own
# bigram counts. `colors =` is also spelled out instead of relying on
# partial matching of `color =`.
wordcloud(ngrams_1$bigram, freq = ngrams_1$n, max.words = 200, min.freq = 5, random.order = FALSE, colors = c("royalblue1","seagreen2", "orangered"), family = "Avenir")
wordcloud(ngrams_3$bigrams, freq = ngrams_3$n, max.words = 200, min.freq = 5, random.order = FALSE, colors = c("royalblue1","seagreen2", "orangered"), family = "Avenir")
wordcloud(ngrams_5$bigram, freq = ngrams_5$n, max.words = 200, min.freq = 5, random.order = FALSE, colors = brewer.pal(8, "Spectral"), family = "Avenir")
wordcloud(ngrams_5_1$bigram, freq = ngrams_5_1$n, max.words = 200, min.freq = 5, random.order = FALSE, colors = brewer.pal(8, "Dark2"), family = "Avenir")
wordcloud(ngrams_5_2$bigram, freq = ngrams_5_2$n, max.words = 200, min.freq = 5, random.order = FALSE, colors = brewer.pal(8, "Set1"), family = "Avenir")
#Bigram Frequency Correlation
# Co-occurrence network built from the claim-3 *bigrams*.
# BUG FIX: the original created ngrams_3_corpus but then tokenized
# na_3_corpus again, so this plot just duplicated the earlier claim-3
# network. Tokenize the bigram corpus that was just built.
ngrams_3_corpus <- corpus(ngrams_3$bigrams)
toks <- ngrams_3_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
  textplot_network(min_freq = 0.5)
#Pyramid Plot
# Back-to-back pyramid plots comparing per-sentiment totals between
# pairs of super-claims. `gap` widens the central label column and
# `ppmar` sets plot margins. The `##` lines below each call are the
# knitted console output (pyramid.plot's returned values).
pyramid.plot(na_1_sentiment_score$score, na_3_sentiment_score$score, labels = na_1_sentiment_score$sentiment, main = "Sentiment Comparison for Claims 1 & 3", gap = 300, top.labels = c("Super Claim 1: CC is Not Happening", "Sentiment", "Super Claim 3: CC is Not Bad"), show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))
## 3363 3363
## [1] 5.1 4.1 4.1 2.1
pyramid.plot(na_1_sentiment_score$score, na_5_claims_sentiment_score$score, labels = na_1_sentiment_score$sentiment, main = "Sentiment Comparison for Claims 1 and 5", top.labels = c("Super Claim 1: CC is Not Happening", "Sentiment", "Super Claim 5: Science/Scientist Not Reliable"), gap = 450, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))
## 6254 6254
## [1] 5.1 4.1 4.1 2.1
pyramid.plot(na_3_sentiment_score$score, na_5_claims_sentiment_score$score, labels = na_3_sentiment_score$sentiment, main = "Sentiment Comparison for Claims 3 and 5", top.labels = c("Super Claim 3: CC is Not Bad", "Sentiment", "Super Claim 5: Science/Scientist Not Reliable"), gap = 500, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))
## 6254 6254
## [1] 5.1 4.1 4.1 2.1
right_join() to combine sentiment scores for super-claims 1 and 3; select() the relevant columns to rename(); transmute() to keep the "Sentiment" column and create a new column named "Score" that is the sum of Score_1 + Score_3
# Combine claim-1 and claim-3 sentiment totals into a single summed
# Score per sentiment, for comparison against claim 5 below.
sentiment_1_3 <- right_join(na_1_sentiment_score, na_3_sentiment_score, by = "sentiment", keep = TRUE)
sentiment_1_3 <- sentiment_1_3 %>%
select(sentiment.x, score.x, score.y) %>%
rename(Sentiment = sentiment.x, Score_1 = score.x, Score_3 = score.y) %>%
ungroup() %>%  # no-op on ungrouped data — presumably leftover; TODO confirm and remove
transmute(Sentiment, Score = Score_1 + Score_3)
# Pyramid plots: combined claims 1+3 vs claim 5, and sub-claim 5_1 vs
# 5_2. The `##` lines are knitted console output from pyramid.plot.
pyramid.plot(sentiment_1_3$Score, na_5_claims_sentiment_score$score, labels = na_5_claims_sentiment_score$sentiment, main = "Sentiment Comparison for Combined Claims 1 + 3 & 5", top.labels = c("Super Claim 1 & 3", "Sentiment", "Super Claim 5: Science/Scientist Not Reliable"), gap = 500, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))
## 6254 6254
## [1] 5.1 4.1 4.1 2.1
pyramid.plot(na_5_1_claims_sentiment_score$score, na_5_2_claims_sentiment_score$score, labels = na_5_1_claims_sentiment_score$sentiment, main = "Sentiment Comparison for Super Claim 5", top.labels = c("Super Claim 5_1:Science is Unreliable", "Sentiment", "Super Claim 5_2:Movement is Unreliable"), gap = 450, show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4))
## 3525 3525
## [1] 5.1 4.1 4.1 2.1
#Comparison Cloud Plot
#Word Associate Plot////NEEDS JAVA
#Other